import pandas as pd
import numpy as np
from pathlib import Path
import os
import sklearn
os.makedirs("html-views", exist_ok=True)
import plotly.express as px
pd.options.plotting.backend = 'plotly'
from dsc80_utils import * # Feel free to uncomment and use this.
# Helper Function to Save Plotly Figures
def save_plot(fig, name):
    """Write a Plotly figure to assets/<name>.html (plotly.js loaded from CDN).

    Parameters
    ----------
    fig : plotly figure with a ``write_html`` method
    name : str, file stem (no extension) under the ``assets`` directory
    """
    # Only "html-views" is created at import time above; create "assets"
    # here so write_html cannot fail on a missing output directory.
    os.makedirs("assets", exist_ok=True)
    path = f"assets/{name}.html"
    fig.write_html(path, include_plotlyjs="cdn")
    print(f"Saved to {path}")
Step 1: Introduction¶
# QUESTION:
# How do climate region and the categorical cause of an outage influence the
# duration of major power outages?
Step 2: Data Cleaning and Exploratory Data Analysis¶
Import Data¶
# Import data
# Raw outage records from the cleaned CSV export (1534 rows x 56 columns).
df = pd.read_csv("data/outage_cleaned_csv.csv")
df
| OBS | YEAR | MONTH | U.S._STATE | ... | AREAPCT_UC | PCT_LAND | PCT_WATER_TOT | PCT_WATER_INLAND | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2011 | 7.0 | Minnesota | ... | 0.60 | 91.59 | 8.41 | 5.48 |
| 1 | 2 | 2014 | 5.0 | Minnesota | ... | 0.60 | 91.59 | 8.41 | 5.48 |
| 2 | 3 | 2010 | 10.0 | Minnesota | ... | 0.60 | 91.59 | 8.41 | 5.48 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1531 | 1532 | 2009 | 8.0 | South Dakota | ... | 0.15 | 98.31 | 1.69 | 1.69 |
| 1532 | 1533 | 2009 | 8.0 | South Dakota | ... | 0.15 | 98.31 | 1.69 | 1.69 |
| 1533 | 1534 | 2000 | NaN | Alaska | ... | 0.02 | 85.76 | 14.24 | 2.90 |
1534 rows × 56 columns
Fix Column Names¶
# Fix column names
def _clean_column(raw):
    """Normalize one raw column label to lowercase words separated by spaces."""
    parts = raw.replace('U.S.', 'US').lower().split('.')
    if len(parts) > 1:
        # Dotted labels (e.g. OUTAGE.START.DATE): join the pieces with spaces.
        return ' '.join(parts)
    # Undotted labels: underscores become spaces instead.
    return parts[0].replace('_', ' ')

df.columns = [_clean_column(label) for label in df.columns]
df.columns
Index(['obs', 'year', 'month', 'us state', 'postal code', 'nerc region',
'climate region', 'anomaly level', 'climate category',
'outage start date', 'outage start time', 'outage restoration date',
'outage restoration time', 'cause category', 'cause category detail',
'hurricane names', 'outage duration', 'demand loss mw',
'customers affected', 'res price', 'com price', 'ind price',
'total price', 'res sales', 'com sales', 'ind sales', 'total sales',
'res percen', 'com percen', 'ind percen', 'res customers',
'com customers', 'ind customers', 'total customers', 'res cust pct',
'com cust pct', 'ind cust pct', 'pc realgsp state', 'pc realgsp usa',
'pc realgsp rel', 'pc realgsp change', 'util realgsp', 'total realgsp',
'util contri', 'pi util ofusa', 'population', 'poppct urban',
'poppct uc', 'popden urban', 'popden uc', 'popden rural',
'areapct urban', 'areapct uc', 'pct land', 'pct water tot',
'pct water inland'],
dtype='object')
Drop All Unnecessary Data¶
# Drop all unnecessary columns
# .copy() makes this an explicit new frame so the later column assignments
# (e.g. 'outage start datetime') write into it directly instead of into a
# view of the original, which would raise SettingWithCopyWarning.
df = df[['year', 'month', 'us state', 'nerc region', 'climate region',
         'climate category', 'outage start date', 'outage start time', 'outage restoration date',
         'outage restoration time', 'cause category', 'outage duration']].copy()
df
| year | month | us state | nerc region | ... | outage restoration date | outage restoration time | cause category | outage duration | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011 | 7.0 | Minnesota | MRO | ... | Sunday, July 3, 2011 | 8:00:00 PM | severe weather | 3060.0 |
| 1 | 2014 | 5.0 | Minnesota | MRO | ... | Sunday, May 11, 2014 | 6:39:00 PM | intentional attack | 1.0 |
| 2 | 2010 | 10.0 | Minnesota | MRO | ... | Thursday, October 28, 2010 | 10:00:00 PM | severe weather | 3000.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1531 | 2009 | 8.0 | South Dakota | RFC | ... | Saturday, August 29, 2009 | 11:53:00 PM | islanding | 59.0 |
| 1532 | 2009 | 8.0 | South Dakota | MRO | ... | Saturday, August 29, 2009 | 2:01:00 PM | islanding | 181.0 |
| 1533 | 2000 | NaN | Alaska | ASCC | ... | NaN | NaN | equipment failure | NaN |
1534 rows × 12 columns
Data Manipulation¶
# Merge date and time columns to make one cohesive timestamp column
# Scratch frame holding the intermediate reformatted date/time strings.
temp = pd.DataFrame()
def convert_date(series):
    """Reformat 'Sunday, July 3, 2011'-style date strings to ISO 'YYYY-MM-DD'."""
    parsed = pd.to_datetime(series, format="%A, %B %d, %Y")
    return parsed.dt.strftime("%Y-%m-%d")
# ISO-format copies of the raw start/restoration date columns.
temp['formatted outage start date'] = convert_date(df['outage start date'])
temp['formatted outage restoration date'] = convert_date(df['outage restoration date'])
def convert_time(series):
    """Reformat 12-hour clock strings like '8:00:00 PM' to 24-hour 'HH:MM:SS'."""
    as_datetime = pd.to_datetime(series, format="%I:%M:%S %p")
    return as_datetime.dt.strftime("%H:%M:%S")
# 24-hour copies of the raw start/restoration time columns.
temp['formatted outage start time'] = convert_time(df['outage start time'])
temp['formatted outage restoration time'] = convert_time(df['outage restoration time'])
# Combine the reformatted date and time strings into true datetime columns;
# rows missing either piece become NaT.
df['outage start datetime'] = pd.to_datetime(temp['formatted outage start date'] + " " + temp['formatted outage start time'])
df['outage restoration datetime'] = pd.to_datetime(temp['formatted outage restoration date'] + " " + temp['formatted outage restoration time'])
df[['outage start datetime', 'outage restoration datetime']]
| outage start datetime | outage restoration datetime | |
|---|---|---|
| 0 | 2011-07-01 17:00:00 | 2011-07-03 20:00:00 |
| 1 | 2014-05-11 18:38:00 | 2014-05-11 18:39:00 |
| 2 | 2010-10-26 20:00:00 | 2010-10-28 22:00:00 |
| ... | ... | ... |
| 1531 | 2009-08-29 22:54:00 | 2009-08-29 23:53:00 |
| 1532 | 2009-08-29 11:00:00 | 2009-08-29 14:01:00 |
| 1533 | NaT | NaT |
1534 rows × 2 columns
# Remove object type date and time columns
# The combined datetime columns built above supersede these raw strings.
df = df.drop(
    columns=[
        'outage start date',
        'outage restoration date',
        'outage start time',
        'outage restoration time',
    ]
)
df
| year | month | us state | nerc region | ... | cause category | outage duration | outage start datetime | outage restoration datetime | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011 | 7.0 | Minnesota | MRO | ... | severe weather | 3060.0 | 2011-07-01 17:00:00 | 2011-07-03 20:00:00 |
| 1 | 2014 | 5.0 | Minnesota | MRO | ... | intentional attack | 1.0 | 2014-05-11 18:38:00 | 2014-05-11 18:39:00 |
| 2 | 2010 | 10.0 | Minnesota | MRO | ... | severe weather | 3000.0 | 2010-10-26 20:00:00 | 2010-10-28 22:00:00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1531 | 2009 | 8.0 | South Dakota | RFC | ... | islanding | 59.0 | 2009-08-29 22:54:00 | 2009-08-29 23:53:00 |
| 1532 | 2009 | 8.0 | South Dakota | MRO | ... | islanding | 181.0 | 2009-08-29 11:00:00 | 2009-08-29 14:01:00 |
| 1533 | 2000 | NaN | Alaska | ASCC | ... | equipment failure | NaN | NaT | NaT |
1534 rows × 10 columns
# Replace all 0s in numeric columns with np.nan
# A value of exactly 0 in these measurements is treated as "not recorded".
numeric_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
for numeric_col in numeric_cols:
    df[numeric_col] = df[numeric_col].replace(0, np.nan)
df
| year | month | us state | nerc region | ... | cause category | outage duration | outage start datetime | outage restoration datetime | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011 | 7.0 | Minnesota | MRO | ... | severe weather | 3060.0 | 2011-07-01 17:00:00 | 2011-07-03 20:00:00 |
| 1 | 2014 | 5.0 | Minnesota | MRO | ... | intentional attack | 1.0 | 2014-05-11 18:38:00 | 2014-05-11 18:39:00 |
| 2 | 2010 | 10.0 | Minnesota | MRO | ... | severe weather | 3000.0 | 2010-10-26 20:00:00 | 2010-10-28 22:00:00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1531 | 2009 | 8.0 | South Dakota | RFC | ... | islanding | 59.0 | 2009-08-29 22:54:00 | 2009-08-29 23:53:00 |
| 1532 | 2009 | 8.0 | South Dakota | MRO | ... | islanding | 181.0 | 2009-08-29 11:00:00 | 2009-08-29 14:01:00 |
| 1533 | 2000 | NaN | Alaska | ASCC | ... | equipment failure | NaN | NaT | NaT |
1534 rows × 10 columns
# Replace all empty strings in string columns with np.nan
string_cols = [c for c in df.columns if pd.api.types.is_string_dtype(df[c])]
for text_col in string_cols:
    df[text_col] = df[text_col].replace('', np.nan)
df
| year | month | us state | nerc region | ... | cause category | outage duration | outage start datetime | outage restoration datetime | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011 | 7.0 | Minnesota | MRO | ... | severe weather | 3060.0 | 2011-07-01 17:00:00 | 2011-07-03 20:00:00 |
| 1 | 2014 | 5.0 | Minnesota | MRO | ... | intentional attack | 1.0 | 2014-05-11 18:38:00 | 2014-05-11 18:39:00 |
| 2 | 2010 | 10.0 | Minnesota | MRO | ... | severe weather | 3000.0 | 2010-10-26 20:00:00 | 2010-10-28 22:00:00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1531 | 2009 | 8.0 | South Dakota | RFC | ... | islanding | 59.0 | 2009-08-29 22:54:00 | 2009-08-29 23:53:00 |
| 1532 | 2009 | 8.0 | South Dakota | MRO | ... | islanding | 181.0 | 2009-08-29 11:00:00 | 2009-08-29 14:01:00 |
| 1533 | 2000 | NaN | Alaska | ASCC | ... | equipment failure | NaN | NaT | NaT |
1534 rows × 10 columns
# View Columns
# Sanity check of the cleaned frame's final column set.
df.columns
Index(['year', 'month', 'us state', 'nerc region', 'climate region',
'climate category', 'cause category', 'outage duration',
'outage start datetime', 'outage restoration datetime'],
dtype='object')
# View in HTML
# Dump the cleaned frame for inspection in a browser.
df.to_html("html-views/cleaned_data.html")
# CSV Output to Generate Markdown Table for df.head
# CSV text is easy to paste into a Markdown table generator for the report.
print(df.head().to_csv(index=False))
year,month,us state,nerc region,climate region,climate category,cause category,outage duration,outage start datetime,outage restoration datetime 2011,7.0,Minnesota,MRO,East North Central,normal,severe weather,3060.0,2011-07-01 17:00:00,2011-07-03 20:00:00 2014,5.0,Minnesota,MRO,East North Central,normal,intentional attack,1.0,2014-05-11 18:38:00,2014-05-11 18:39:00 2010,10.0,Minnesota,MRO,East North Central,cold,severe weather,3000.0,2010-10-26 20:00:00,2010-10-28 22:00:00 2012,6.0,Minnesota,MRO,East North Central,normal,severe weather,2550.0,2012-06-19 04:30:00,2012-06-20 23:00:00 2015,7.0,Minnesota,MRO,East North Central,warm,severe weather,1740.0,2015-07-18 02:00:00,2015-07-19 07:00:00
Univariate Data Analysis¶
# Histogram for Distribution of Cause Category
fig = px.histogram(
    df,
    x='cause category',
    nbins=20,
    title="Distribution of Cause Category",
    labels={'cause category': 'Cause Category'},
    color_discrete_sequence=['skyblue'],
)
# One consolidated layout pass: axis labels, fonts, figure size, margins.
fig.update_layout(
    xaxis_title="Cause Category",
    yaxis_title="Frequency",
    title_font_size=20,
    xaxis_tickfont_size=12,
    yaxis_tickfont_size=12,
    width=800,
    height=500,
    margin=dict(l=50, r=50, t=50, b=50),
)
fig.show()
save_plot(fig, "outages-by-cause-category")
Saved to assets/outages-by-cause-category.html
# Pie Chart for Climate Category of Power Outages
# Count outages per climate category as a two-column frame for px.pie.
cc_counts = (
    df['climate category']
    .value_counts()
    .rename_axis('Climate Category')
    .reset_index(name='Count')
)
fig = px.pie(
    cc_counts,
    names='Climate Category',
    values='Count',
    title="Pie Chart of Categories",
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
# Single layout pass; the title set here replaces the placeholder above.
fig.update_layout(
    width=700,
    height=500,
    title=dict(
        text="Pie Chart for Climate Categories of Power Outages",
        font=dict(size=20),
        x=0.5,
    ),
    margin=dict(l=50, r=50, t=50, b=20),
)
fig.show()
# Pie Chart for Climate Region of Power Outages
region_counts = (
    df['climate region']
    .value_counts()
    .rename_axis('Climate Region')
    .reset_index(name='Count')
)
fig = px.pie(
    region_counts,
    names='Climate Region',
    values='Count',
    title="Pie Chart of Climate Region",
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
# Single layout pass; the title set here replaces the placeholder above.
fig.update_layout(
    width=700,
    height=500,
    title=dict(
        text="Pie Chart for Climate Region of Power Outages",
        font=dict(size=20),
        x=0.5,
    ),
    margin=dict(l=50, r=50, t=50, b=20),
)
fig.show()
save_plot(fig, "outages-by-climate-region")
Saved to assets/outages-by-climate-region.html
# Boxplot for Outage Duration
fig = px.box(
    df,
    x='outage duration',
    points='all',
    title="Boxplot of Outage Duration",
    color_discrete_sequence=['skyblue'],
)
# Consolidated layout: the later 1500x400 size won over the earlier 700x500
# draft in the original, so only the final values are set here.
fig.update_layout(
    title=dict(
        text="Boxplot of Outage Duration (minutes)",
        font=dict(size=20),
        x=0.5,
    ),
    xaxis_title="Outage Duration (minutes)",
    width=1500,
    height=400,
    margin=dict(l=50, r=50, t=50, b=50),
)
fig.show()
# Same boxplot without the individual points, for a readable scale.
fig = px.box(
    df,
    x='outage duration',
    points=False,
    title="Boxplot of Outage Duration (minutes)",
    color_discrete_sequence=['skyblue'],
)
# Consolidated layout: final size is 900x400 (last values in the original).
fig.update_layout(
    title=dict(
        text="Boxplot of Outage Duration (minutes) WITHOUT Points and Outliers",
        font=dict(size=20),
        x=0.5,
    ),
    xaxis_title="Outage Duration (minutes)",
    width=900,
    height=400,
    margin=dict(l=50, r=50, t=50, b=50),
)
fig.show()
# Line Graph of Counts per Year
yearly_counts = df['year'].value_counts().sort_index()
fig = px.line(
    x=yearly_counts.index,
    y=yearly_counts.values,
    markers=True,
    title="Count of Outages By Year",
)
fig.update_layout(
    xaxis_title="Year",
    yaxis_title="Outages",
    width=800,
    height=450,
    margin=dict(l=50, r=50, t=50, b=50),
)
fig.show()
save_plot(fig, "outages-by-year")
Saved to assets/outages-by-year.html
# Line Graph of Counts per Hour
start_hours = df['outage start datetime'].dt.hour
hourly_counts = start_hours.value_counts().sort_index()
fig = px.line(
    x=hourly_counts.index,
    y=hourly_counts.values,
    markers=True,
    title="Count of Outages By Hour of the Day",
)
fig.update_layout(
    width=800,
    height=450,
    xaxis_title="Hour of Day (0–23)",
    yaxis_title="Outages",
    title=dict(x=0.5),
    margin=dict(l=50, r=50, t=50, b=50),
)
fig.show()
save_plot(fig, "outages-by-hour")
Saved to assets/outages-by-hour.html
Bivariate Analysis¶
# Scatter Plot of Month (1-12) VS Outage Duration (min)
fig = px.scatter(
    df,
    x="month",
    y="outage duration",
    title="Outage Duration (min) by Month (1-12)",
    opacity=0.7,
)
fig.update_layout(
    width=800,
    height=500,
    xaxis_title="Month (1-12)",
    yaxis_title="Outage Duration (min)",
    title=dict(x=0.5),
    margin=dict(l=50, r=50, t=50, b=50),
)
fig.show()
save_plot(fig, "outage-duration-by-month")
Saved to assets/outage-duration-by-month.html
# Scatter Plot of Hour (0-23) VS Outage Duration (min)
# Reuse the scratch frame for the derived hour column.
temp['hour'] = df['outage start datetime'].dt.hour
temp['outage duration'] = df['outage duration']
fig = px.scatter(
    temp,
    x="hour",
    y="outage duration",
    title="Hour (0-23) VS Outage Duration (min)",
    opacity=0.7,
)
fig.update_layout(
    width=800,
    height=500,
    xaxis_title="Hour (0-23)",
    yaxis_title="Outage Duration (min)",
    title=dict(x=0.5),
    margin=dict(l=50, r=50, t=50, b=50),
)
fig.show()
# Scatter Plot of Cause Category VS Outage Duration (min)
# (duplicate in-cell `import plotly.express as px` removed; px is already
# imported at the top of the file.)
fig = px.scatter(
    df,
    x="cause category",
    y="outage duration",
    title="Outage Duration (min) by Cause Category",
)
fig.update_layout(
    xaxis_title="Cause Category",
    yaxis_title="Outage Duration (min)",
    width=800,
    height=500,
    title=dict(x=0.5),
    margin=dict(l=80, r=50, t=50, b=50),
)
fig.show()
save_plot(fig, "outage-duration-by-cause-category")
Saved to assets/outage-duration-by-cause-category.html
# Stacked Bar Chart of NERC Region VS Power Outages by Cause Category
# (duplicate in-cell `import plotly.express as px` removed; px is already
# imported at the top of the file.)
fig = px.bar(
    df,
    x="nerc region",
    color="cause category",
    title="Distribution of Outage Causes by NERC Region",
)
fig.update_layout(
    barmode="stack",
    xaxis_title="NERC Region",
    yaxis_title="Power Outages",
    width=900,
    height=500,
    title=dict(x=0.5),
    margin=dict(l=80, r=50, t=50, b=50),
)
fig.show()
save_plot(fig, "outage-cause-by-nerc-region")
Saved to assets/outage-cause-by-nerc-region.html
Interesting Aggregates¶
# Median Outage Duration for Climate Region VS Category Cause
# Rows: climate region; columns: cause category; cells: median duration (min).
median_duration = df.pivot_table(
    index="climate region",
    columns="cause category",
    values="outage duration",
    aggfunc="median",
)
median_duration.to_html("html-views/interesting-agg-pivot.html")
median_duration
| cause category | equipment failure | fuel supply emergency | intentional attack | islanding | public appeal | severe weather | system operability disruption |
|---|---|---|---|---|---|---|---|
| climate region | |||||||
| Central | 149.0 | 7500.5 | 198.0 | 96.0 | 1410.0 | 1695.0 | 65.0 |
| East North Central | 761.0 | 13564.0 | 1046.0 | 1.0 | 733.0 | 4005.0 | 2694.0 |
| Northeast | 267.5 | 12240.0 | 30.0 | 881.0 | 2760.0 | 3189.0 | 234.5 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| Southwest | 35.0 | 76.0 | 57.0 | 2.0 | 2275.0 | 2425.0 | 337.5 |
| West | 269.0 | 882.5 | 118.0 | 128.5 | 420.0 | 962.0 | 199.0 |
| West North Central | 61.0 | NaN | 47.0 | 56.0 | 439.5 | 83.0 | NaN |
9 rows × 7 columns
# Average Month for Climate Category VS Cause Category
# Cells hold the mean calendar month (1-12) of outages in each group.
mean_month = df.pivot_table(
    index="climate category",
    columns="cause category",
    values="month",
    aggfunc="mean",
)
mean_month
| cause category | equipment failure | fuel supply emergency | intentional attack | islanding | public appeal | severe weather | system operability disruption |
|---|---|---|---|---|---|---|---|
| climate category | |||||||
| cold | 5.37 | 4.26 | 5.13 | 7.53 | 6.95 | 5.86 | 5.19 |
| normal | 5.71 | 6.77 | 6.29 | 6.24 | 5.59 | 6.73 | 6.12 |
| warm | 8.30 | 4.60 | 4.89 | 5.79 | 7.15 | 7.40 | 6.43 |
Step 3: Assessment of Missingness¶
Identify Columns with Missing Values¶
# Identify columns with missing values
# Row count before any rows are dropped; used later to report data retention.
og_total_rows = df.shape[0]
def check_cols(df):
    """Return {column name: NaN count} for every column with missing data."""
    na_counts = df.isna().sum()
    return {col: int(cnt) for col, cnt in na_counts.items() if cnt > 0}
check_cols(df)
{'month': 9,
'climate region': 6,
'climate category': 9,
'outage duration': 136,
'outage start datetime': 9,
'outage restoration datetime': 58}
NMAR Analysis¶
The column OUTAGE.RESTORATION.DATE (kept in the cleaned data as the combined `outage restoration datetime` column), which contains 58 missing values, can be classified as Not Missing At Random (NMAR) because the missingness could be inherently linked to the nature of the outage event itself. For example, if the power had not yet been restored at the time the data was reported, there would be no date or time to record, which directly ties the missing value to the ongoing status of the event. Similarly, for outages that were minor or very short, restoration details could have been deemed unnecessary to log, which again means the absence of this data is related to the characteristics of the outage rather than occurring randomly. In both cases, the probability of a missing value depends on the unobserved value or the specific circumstances of the event rather than on other observed data, making this column likely NMAR.
Missingness Dependency¶
# Set columns and number of permutation tests
# Column whose missingness we are testing.
target_col = "outage duration"
# Column hypothesized to be associated with that missingness (MAR candidate).
dependent_col = "cause category"
# Column hypothesized NOT to be associated with that missingness.
independent_col = "nerc region"
n_permutations = 1000
# Compute TVD
def tvd_between_groups(df, col, missing_indicator):
    """Total variation distance between the distributions of `col` for rows
    where `missing_indicator` is 1 versus rows where it is 0."""
    dist_missing = df.loc[missing_indicator == 1, col].value_counts(normalize=True)
    dist_present = df.loc[missing_indicator == 0, col].value_counts(normalize=True)
    # Align both distributions on the full category set before differencing.
    categories = dist_missing.index.union(dist_present.index)
    aligned_missing = dist_missing.reindex(categories, fill_value=0)
    aligned_present = dist_present.reindex(categories, fill_value=0)
    return 0.5 * np.sum(np.abs(aligned_missing - aligned_present))
# Permutation test with TVD
def permutation_test_tvd(df, target_col, other_col, n_permutations=1000):
    """Permutation test for whether the missingness of `target_col` depends
    on `other_col`.

    Returns (observed TVD, array of permuted TVDs, one-sided p-value).
    """
    missing_indicator = df[target_col].isna().astype(int)
    observed_tvd = tvd_between_groups(df, other_col, missing_indicator)
    # Shuffle `other_col` to break any real association, then recompute the TVD.
    perm_tvd = np.array([
        tvd_between_groups(
            df.assign(temp=np.random.permutation(df[other_col])),
            'temp',
            missing_indicator,
        )
        for _ in range(n_permutations)
    ])
    # One-sided: how often does chance alone produce a TVD at least this large?
    p_value = (perm_tvd >= observed_tvd).mean()
    return observed_tvd, perm_tvd, p_value
Null Hypothesis (H₀): The distribution of cause category when outage duration is missing is the same as when outage duration is not missing.
Alternative Hypothesis (H₁): The distributions are not the same.
# Run permutation test for dependent column
obs_dep, perm_dep, p_dep = permutation_test_tvd(df, target_col, dependent_col, n_permutations)
print(f"permutation test for dependency on {dependent_col}:")
print(f"observed TVD: {obs_dep:.3f}, p-value: {p_dep:.3f}\n")
# Derive the conclusion from the p-value instead of hard-coding it, so the
# printed verdict always matches the test actually run.
alpha = 0.05
print('REJECT null hypothesis' if p_dep < alpha else 'FAIL to reject null hypothesis')
permutation test for dependency on cause category: observed TVD: 0.469, p-value: 0.000 REJECT null hypothesis
# Distribution of Cause Category by Missingness of Outage Duration
df['missing_flag'] = df[target_col].isna()
fig = px.histogram(
    df,
    x=dependent_col,
    color='missing_flag',
    barmode='group',
    histnorm='probability',
    title="Distribution of Cause Category by Missingness of Outage Duration",
    width=800,
    height=600,
)
fig.update_layout(
    xaxis_title="Cause Category",
    yaxis_title="Proportion",
    legend_title="Missing Duration",
    margin=dict(l=70, r=50, t=50, b=50),
)
fig.show()
save_plot(fig, 'missingness-cause-category')
Saved to assets/missingness-cause-category.html
# Empirical Distribution of TVD for Cause Category by Missingness of Outage Duration
fig_perm = px.histogram(
    pd.DataFrame({'perm_tvd': perm_dep}),
    x='perm_tvd',
    nbins=30,
    histnorm='probability',
    title="Empirical Distribution of TVD for Cause Category by Missingness of Outage Duration",
    width=800,
    height=500,
)
# Dashed red line marking the observed statistic against the null distribution.
fig_perm.add_shape(
    type="line",
    x0=obs_dep, x1=obs_dep,
    y0=0, y1=.2,
    line=dict(color="red", width=3, dash="dash"),
)
fig_perm.add_annotation(
    x=obs_dep,
    y=.17,
    text=f"Observed TVD = {obs_dep:.3f}",
    showarrow=True,
    arrowhead=3,
    ax=80,
    ay=-20,
)
fig_perm.update_layout(
    xaxis_title="TVD",
    yaxis_title="Probability",
    margin=dict(l=70, r=50, t=50, b=50),
)
fig_perm.show()
save_plot(fig_perm, 'tvd-missingness-cause-category')
Saved to assets/tvd-missingness-cause-category.html
Null Hypothesis (H₀): The distribution of NERC region when outage duration is missing is the same as when outage duration is not missing.
Alternative Hypothesis (H₁): The distributions are not the same.
# Run permutation test for independent column
obs_indep, perm_indep, p_indep = permutation_test_tvd(df, target_col, independent_col, n_permutations)
print(f"permutation test for dependency on {independent_col}:")
print(f"observed TVD: {obs_indep:.3f}, p-value: {p_indep:.3f}\n")
# Derive the conclusion from the p-value instead of hard-coding it: the
# recorded run produced p = 0.047, which at the 0.05 level would actually
# REJECT the null, contradicting the previously hard-coded message.
alpha = 0.05
print('REJECT null hypothesis' if p_indep < alpha else 'FAIL to reject null hypothesis')
permutation test for dependency on nerc region: observed TVD: 0.143, p-value: 0.047 FAIL to reject null hypothesis
# Distribution of NERC Region by Missingness of Outage Duration
df['missing_flag'] = df[target_col].isna()
fig = px.histogram(
    df,
    x=independent_col,
    color='missing_flag',
    barmode='group',
    histnorm='probability',
    title="Distribution of NERC Region by Missingness of Outage Duration",
    width=800,
    height=600,
)
fig.update_layout(
    xaxis_title="NERC Region",
    yaxis_title="Proportion",
    legend_title="Missing Duration",
    margin=dict(l=70, r=50, t=50, b=50),
)
fig.show()
save_plot(fig, 'missingness-nerc-region')
Saved to assets/missingness-nerc-region.html
# Empirical Distribution of TVD for NERC Region by Missingness of Outage Duration
fig_perm = px.histogram(
    pd.DataFrame({'perm_tvd': perm_indep}),
    x='perm_tvd',
    nbins=30,
    histnorm='probability',
    title="Empirical Distribution of TVD for NERC Region by Missingness of Outage Duration",
    width=800,
    height=500,
)
# Dashed red line marking the observed statistic against the null distribution.
fig_perm.add_shape(
    type="line",
    x0=obs_indep, x1=obs_indep,
    y0=0, y1=.2,
    line=dict(color="red", width=3, dash="dash"),
)
fig_perm.add_annotation(
    x=obs_indep,
    y=.17,
    text=f"Observed TVD = {obs_indep:.3f}",
    showarrow=True,
    arrowhead=3,
    ax=80,
    ay=-20,
)
fig_perm.update_layout(
    xaxis_title="TVD",
    yaxis_title="Probability",
    margin=dict(l=70, r=50, t=50, b=50),
)
fig_perm.show()
save_plot(fig_perm, 'tvd-missingness-nerc-region')
Saved to assets/tvd-missingness-nerc-region.html
Step 4: Hypothesis Testing¶
Null Hypothesis (H₀): The average outage duration in the South climate region is the same as the average outage duration in the Northeast climate region.
Alternative Hypothesis (H₁): The average outage duration in the South climate region is greater than the average outage duration in the Northeast climate region.
Test Statistic: Difference in mean outage duration between South and Northeast (Mean South − Mean Northeast)
# Extract outages durations for South and Northeast climate regions
durations_south = df.loc[df['climate region'] == 'South', 'outage duration'].dropna().values
durations_northeast = df.loc[df['climate region'] == 'Northeast', 'outage duration'].dropna().values
# Compute observed mean difference (South - Northeast)
observed_diff = durations_south.mean() - durations_northeast.mean()
# Run permutation test: pool both groups, reshuffle, and re-split to
# simulate the null hypothesis of identical region distributions.
combined = np.concatenate([durations_south, durations_northeast])
n_south = len(durations_south)
n_permutations = 10000
diff_permutations = []
for _ in range(n_permutations):
    np.random.shuffle(combined)
    perm_south = combined[:n_south]
    perm_northeast = combined[n_south:]
    diff_permutations.append(perm_south.mean() - perm_northeast.mean())
diff_permutations = np.array(diff_permutations)
# Compute one-sided p-value (mean South - mean Northeast)
p_value = np.mean(diff_permutations >= observed_diff)
print("Observed mean difference:", observed_diff)
print("Permutation test p-value:", p_value)
# Derive the conclusion from the p-value instead of hard-coding it.
alpha = 0.05
print('REJECT null hypothesis' if p_value < alpha else 'FAIL to reject null hypothesis')
Observed mean difference: -458.064095649047 Permutation test p-value: 0.8328 FAIL to reject null hypothesis
# Plot distribution of mean difference
fig = px.histogram(
    pd.DataFrame({'Mean Difference': diff_permutations}),
    x='Mean Difference',
    nbins=50,
    title='Mean Difference Distribution in Outage Duration (min)',
    opacity=0.75,
    histnorm='probability',
    width=850,
    height=600,
)
# Mark where the observed difference falls in the null distribution.
fig.add_vline(
    x=observed_diff,
    line_color='red',
    line_width=3,
    annotation_text='Observed Difference',
    annotation_position='top left',
)
# Single layout pass; the final axis title includes the direction of the diff.
fig.update_layout(
    xaxis_title='Mean Difference (South - Northeast)',
    yaxis_title='Probability',
    bargap=0.1,
    margin=dict(l=80, r=50, t=100, b=80),
)
fig.show()
save_plot(fig, 'hyp-test-fig')
Saved to assets/hyp-test-fig.html
Step 5: Framing a Prediction Problem¶
For this project, I will develop a model that predicts the cause category of a major power outage, making this a multiclass classification problem. The response variable is cause category because identifying the likely cause of an outage early is critical for dispatching the right crews, allocating resources, and improving grid reliability. I chose a classification approach because cause category is a discrete categorical variable with seven possible classes, so the model should assign each outage to one of those classes rather than output a continuous number.
The features I want to use for prediction are month, NERC region, climate region, and climate category (with year and U.S. state added in the final model). I selected these because they are known at the time an outage begins, making them suitable for real-time prediction. I will evaluate the model using accuracy together with the macro-averaged F1-score; macro F1 matters here because the classes are imbalanced (severe weather dominates), and accuracy alone would reward a model that ignores the rarer causes. This setup ensures the model only uses information that would be available at the time of prediction and provides actionable insights for utilities managing outages.
Step 6: Baseline Model¶
Data Manipulation¶
# Check columns for missing values
# Missingness snapshot before rows are dropped for modeling.
check_cols(df)
{'month': 9,
'climate region': 6,
'climate category': 9,
'outage duration': 136,
'outage start datetime': 9,
'outage restoration datetime': 58}
# Fix 'month' column
# Inspect the affected rows, then drop them (only 9 rows).
df[df['month'].isna()]
df = df.dropna(subset=['month'])
check_cols(df)
# This fixed 'month' along with 'anomaly level', 'climate category',
# 'outage start date', and 'outage start time'.
{'climate region': 5,
'outage duration': 127,
'outage restoration datetime': 49}
# Fix Outage Restoration Date
# Rows without a recorded duration cannot be used as model targets; drop them.
df[df['outage duration'].isna()]
df = df.dropna(subset=['outage duration'])
check_cols(df)
# This fixed 'outage restoration date' along with
# 'outage restoration time', and 'outage duration'.
{'climate region': 5}
# Fix Climate Region
# Only 5 rows remain without a climate region; drop those as well.
df[df['climate region'].isna()]
df = df.dropna(subset=['climate region'])
check_cols(df)
# All missing data fixed.
{}
# Summarize how much data survived the row drops above.
rows_dropped = og_total_rows - df.shape[0]
pct_rows_dropped = round((rows_dropped / og_total_rows) * 100, 2)
f'Retained approximately {100-pct_rows_dropped}% of original data as only {rows_dropped} of the {og_total_rows} rows were dropped!'
'Retained approximately 90.81% of original data as only 141 of the 1534 rows were dropped!'
Pipeline Building¶
# Imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
# Split X, y train
# NOTE(review): Step 5's prose describes a regression on 'outage duration'
# evaluated with MAE, but this model classifies 'cause category' — confirm
# which framing is intended.
features = ['month', 'nerc region', 'climate region', 'climate category']
target = 'cause category'
X = df[features]
y = df[target]
# Encode target
# XGBoost needs integer class labels; LabelEncoder maps category names to ints.
le = LabelEncoder()
y_encoded = le.fit_transform(y)
# Train/test split
# stratify keeps the class proportions equal across train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)
# Preprocessing pipeline
# One-hot encode categoricals (unknown test-time categories become all-zeros)
# and standardize the numeric 'month' feature.
categorical_features = ['nerc region', 'climate region', 'climate category']
numeric_features = ['month']
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('num', StandardScaler(), numeric_features)
])
# XGBoost pipeline
xgb_model = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', XGBClassifier(
        objective='multi:softmax',
        num_class=len(le.classes_),
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        random_state=42
    ))
])
# Train model
xgb_model.fit(X_train, y_train)
Pipeline(steps=[('preprocess',
ColumnTransformer(transformers=[('cat',
OneHotEncoder(handle_unknown='ignore'),
['nerc region',
'climate region',
'climate category']),
('num', StandardScaler(),
['month'])])),
('classifier',
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None,
ear...
feature_types=None, feature_weights=None,
gamma=None, grow_policy=None,
importance_type=None,
interaction_constraints=None, learning_rate=0.05,
max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None,
max_depth=6, max_leaves=None,
min_child_weight=None, missing=nan,
monotone_constraints=None, multi_strategy=None,
n_estimators=500, n_jobs=None, num_class=7, ...))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preprocess',
ColumnTransformer(transformers=[('cat',
OneHotEncoder(handle_unknown='ignore'),
['nerc region',
'climate region',
'climate category']),
('num', StandardScaler(),
['month'])])),
('classifier',
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None,
ear...
feature_types=None, feature_weights=None,
gamma=None, grow_policy=None,
importance_type=None,
interaction_constraints=None, learning_rate=0.05,
max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None,
max_depth=6, max_leaves=None,
min_child_weight=None, missing=nan,
monotone_constraints=None, multi_strategy=None,
n_estimators=500, n_jobs=None, num_class=7, ...))])ColumnTransformer(transformers=[('cat', OneHotEncoder(handle_unknown='ignore'),
['nerc region', 'climate region',
'climate category']),
('num', StandardScaler(), ['month'])])['nerc region', 'climate region', 'climate category']
OneHotEncoder(handle_unknown='ignore')
['month']
StandardScaler()
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
feature_weights=None, gamma=None, grow_policy=None,
importance_type=None, interaction_constraints=None,
learning_rate=0.05, max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=6,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, multi_strategy=None, n_estimators=500,
n_jobs=None, num_class=7, ...)# Predict
# Predict encoded labels on the held-out set, then map back to category names.
y_pred_enc = xgb_model.predict(X_test)
y_pred = le.inverse_transform(y_pred_enc)
y_pred
array(['severe weather', 'severe weather', 'intentional attack', ...,
'severe weather', 'severe weather', 'severe weather'], dtype=object)
# Evaluate
# zero_division=0 silences the UndefinedMetricWarning raised for classes
# that received no predictions (their precision is reported as 0 instead).
print("Accuracy:", accuracy_score(le.inverse_transform(y_test), y_pred))
print("F1-score (macro):", f1_score(le.inverse_transform(y_test), y_pred, average='macro', zero_division=0))
print("\nClassification Report:\n", classification_report(le.inverse_transform(y_test), y_pred, zero_division=0))
Accuracy: 0.6021505376344086
F1-score (macro): 0.31539611055637107
Classification Report:
precision recall f1-score support
equipment failure 0.00 0.00 0.00 11
fuel supply emergency 0.33 0.14 0.20 7
intentional attack 0.58 0.52 0.54 66
islanding 0.00 0.00 0.00 9
public appeal 0.56 0.36 0.43 14
severe weather 0.66 0.82 0.73 148
system operability disruption 0.30 0.29 0.30 24
accuracy 0.60 279
macro avg 0.35 0.30 0.32 279
weighted avg 0.55 0.60 0.57 279
c:\Users\nimis\miniforge3\envs\dsc80\Lib\site-packages\sklearn\metrics\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. c:\Users\nimis\miniforge3\envs\dsc80\Lib\site-packages\sklearn\metrics\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. c:\Users\nimis\miniforge3\envs\dsc80\Lib\site-packages\sklearn\metrics\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Step 7: Final Model¶
# Select predictors and the prediction target for the final model.
features = ['year', 'month', 'us state', 'nerc region', 'climate region', 'climate category']
target = 'cause category'
X, y = df[features], df[target]

# XGBoost needs integer class ids, so label-encode the string target.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Hold out 20% for testing; stratify so each split keeps the same
# class proportions as the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)
# Preprocessing: one-hot encode the categorical columns (categories
# unseen at predict time are ignored rather than raising) and
# standardize the numeric ones.
categorical_features = ['us state', 'nerc region', 'climate region', 'climate category']
numeric_features = ['year', 'month']
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', StandardScaler(), numeric_features),
    ]
)
# Chain the preprocessor with a multiclass XGBoost classifier.
# Step names ('preprocess', 'classifier') are referenced by the
# grid-search parameter grid below, so they must stay stable.
xgb_classifier = XGBClassifier(
    objective='multi:softmax',      # emit a single class id per row
    num_class=len(le.classes_),     # one output class per cause category
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    random_state=42,
)
xgb_model = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', xgb_classifier),
])
# --- Hyperparameter tuning with 3-fold grid search ---
from sklearn.model_selection import GridSearchCV

# Tune the three knobs that matter most for boosted trees:
#   classifier__max_depth      — depth of each individual tree
#   classifier__learning_rate  — how much each tree can shift the model
#   classifier__n_estimators   — number of trees in the ensemble
param_grid = {
    'classifier__max_depth': [5, 6, 7],
    'classifier__learning_rate': [0.03, 0.05, 0.07],
    'classifier__n_estimators': [400, 500, 600],
}

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=3,
    scoring='f1_macro',  # macro-F1 weights rare cause categories equally
    n_jobs=-1,
    verbose=2,
)

# Fit on the training split only, so the test set stays untouched.
grid_search.fit(X_train, y_train)

# Report the winning configuration and its cross-validated score.
print("Best Params:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best Params: {'classifier__learning_rate': 0.07, 'classifier__max_depth': 6, 'classifier__n_estimators': 600}
Best CV Score: 0.3993510793503334
# GridSearchCV defaults to refit=True, which retrains the best pipeline
# on the full training split after the search finishes — so
# best_estimator_ comes back already fitted. The extra
# final_model.fit(X_train, y_train) in the original just repeated
# identical work and has been removed.
final_model = grid_search.best_estimator_
final_model
Pipeline(steps=[('preprocess',
ColumnTransformer(transformers=[('cat',
OneHotEncoder(handle_unknown='ignore'),
['us state', 'nerc region',
'climate region',
'climate category']),
('num', StandardScaler(),
['year', 'month'])])),
('classifier',
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=Non...
feature_types=None, feature_weights=None,
gamma=None, grow_policy=None,
importance_type=None,
interaction_constraints=None, learning_rate=0.07,
max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None,
max_depth=6, max_leaves=None,
min_child_weight=None, missing=nan,
monotone_constraints=None, multi_strategy=None,
n_estimators=600, n_jobs=None, num_class=7, ...))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preprocess',
ColumnTransformer(transformers=[('cat',
OneHotEncoder(handle_unknown='ignore'),
['us state', 'nerc region',
'climate region',
'climate category']),
('num', StandardScaler(),
['year', 'month'])])),
('classifier',
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=Non...
feature_types=None, feature_weights=None,
gamma=None, grow_policy=None,
importance_type=None,
interaction_constraints=None, learning_rate=0.07,
max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None,
max_depth=6, max_leaves=None,
min_child_weight=None, missing=nan,
monotone_constraints=None, multi_strategy=None,
n_estimators=600, n_jobs=None, num_class=7, ...))])ColumnTransformer(transformers=[('cat', OneHotEncoder(handle_unknown='ignore'),
['us state', 'nerc region', 'climate region',
'climate category']),
('num', StandardScaler(), ['year', 'month'])])['us state', 'nerc region', 'climate region', 'climate category']
OneHotEncoder(handle_unknown='ignore')
['year', 'month']
StandardScaler()
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
feature_weights=None, gamma=None, grow_policy=None,
importance_type=None, interaction_constraints=None,
learning_rate=0.07, max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=6,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, multi_strategy=None, n_estimators=600,
n_jobs=None, num_class=7, ...)# Predict
# Predict integer class ids with the tuned final model, then decode
# them back to the original string cause-category labels so the
# evaluation below is human-readable.
y_pred_enc = final_model.predict(X_test)
y_pred = le.inverse_transform(y_pred_enc)
y_pred
array(['severe weather', 'severe weather', 'intentional attack', ...,
'severe weather', 'equipment failure', 'severe weather'],
dtype=object)
# Evaluate the tuned final model on the test set.
# Decode y_test once (instead of three separate inverse_transform calls)
# and pass zero_division=0 so classes with no predicted samples score
# 0 precision without raising UndefinedMetricWarning.
y_test_labels = le.inverse_transform(y_test)
print("Accuracy:", accuracy_score(y_test_labels, y_pred))
print("F1-score (macro):", f1_score(y_test_labels, y_pred, average='macro'))
print("\nClassification Report:\n",
      classification_report(y_test_labels, y_pred, zero_division=0))
Accuracy: 0.7060931899641577
F1-score (macro): 0.4788951959596007
Classification Report:
precision recall f1-score support
equipment failure 0.25 0.09 0.13 11
fuel supply emergency 0.33 0.29 0.31 7
intentional attack 0.79 0.82 0.81 66
islanding 0.50 0.33 0.40 9
public appeal 0.55 0.79 0.65 14
severe weather 0.78 0.81 0.80 148
system operability disruption 0.27 0.25 0.26 24
accuracy 0.71 279
macro avg 0.50 0.48 0.48 279
weighted avg 0.69 0.71 0.69 279
Step 8: Fairness Analysis¶
# Imports
from sklearn.metrics import precision_score

# Fairness check: does the model achieve the same macro precision for
# outages in the South as in the Northwest climate region?
group_col = 'climate region'
group_X_value = 'South'
group_Y_value = 'Northwest'

# Work with a DataFrame view of X_test so column selection works.
if isinstance(X_test, pd.DataFrame):
    X_test_df = X_test.copy()
else:
    X_test_df = pd.DataFrame(X_test, columns=features)

# Boolean masks selecting the test rows that belong to each group.
mask_X = X_test_df[group_col] == group_X_value
mask_Y = X_test_df[group_col] == group_Y_value

# Observed difference in macro precision between the two groups
# (zero_division=0 keeps classes with no predictions from erroring).
y_test_labels = le.inverse_transform(y_test)
y_pred_labels = y_pred
precision_X = precision_score(
    y_test_labels[mask_X], y_pred_labels[mask_X],
    average='macro', zero_division=0,
)
precision_Y = precision_score(
    y_test_labels[mask_Y], y_pred_labels[mask_Y],
    average='macro', zero_division=0,
)
observed_diff = precision_X - precision_Y
print(f"Observed Precision Difference (South - Northwest): {observed_diff:.4f}")
Observed Precision Difference (South - Northwest): -0.2269
# Permutation test: under the null hypothesis the model is equally
# precise for both groups, so group membership can be shuffled freely.
# Changes vs. the original: the unused `combined_mask` variable was
# removed, and the group-X size is computed once instead of calling
# sum(mask_X) twice inside every one of the 10,000 iterations.
n_permutations = 10000

# Pool both groups' true labels and predictions; after each shuffle the
# first n_X positions play the role of group X (South).
n_X = int(mask_X.sum())
combined_labels = np.concatenate([y_test_labels[mask_X], y_test_labels[mask_Y]])
combined_preds = np.concatenate([y_pred_labels[mask_X], y_pred_labels[mask_Y]])

perm_diffs = []
for _ in range(n_permutations):
    perm_indices = np.random.permutation(len(combined_labels))
    perm_labels = combined_labels[perm_indices]
    perm_preds = combined_preds[perm_indices]
    perm_prec_X = precision_score(perm_labels[:n_X], perm_preds[:n_X],
                                  average='macro', zero_division=0)
    perm_prec_Y = precision_score(perm_labels[n_X:], perm_preds[n_X:],
                                  average='macro', zero_division=0)
    perm_diffs.append(perm_prec_X - perm_prec_Y)
perm_diffs = np.array(perm_diffs)
# One-sided p-value: fraction of null-distribution differences at least
# as extreme (as negative) as the observed South - Northwest difference.
p_value = np.mean(perm_diffs <= observed_diff)
print(f"P-value: {p_value:.4f}")

# Base the conclusion on the p-value. The original printed
# "Fail to reject" unconditionally, which would be wrong whenever
# p_value fell below the significance level.
alpha = 0.05
if p_value < alpha:
    print("Reject null hypothesis: Evidence of unfairness detected.")
else:
    print("Fail to reject null hypothesis: No evidence of unfairness detected.")
P-value: 0.1569 Fail to reject null hypothesis: No evidence of unfairness detected.
# Histogram for Distribution of Precision Differences (South - Northwest)
# under the null, with the observed difference marked by a dashed line.
fig = px.histogram(
    perm_diffs,
    nbins=50,
    title="Distribution of Precision Difference (South - Northwest) Under Null Hypothesis",
    labels={"value": "Precision Difference (South - Northwest)"},
    width=800,
    height=600,
)
# Single layout update — the original called update_layout twice with
# identical margin/showlegend settings, making the first call dead code.
# The marker line and annotation use yref="paper" (0..1 of plot height)
# instead of hard-coded data coordinates (y1=1000, y=900), so they stay
# correct regardless of how tall the histogram bars turn out to be.
fig.update_layout(
    margin=dict(l=50, r=50, t=50, b=50),
    showlegend=False,
    shapes=[
        dict(
            type="line",
            x0=observed_diff,
            x1=observed_diff,
            y0=0,
            y1=1,
            xref="x",
            yref="paper",
            line=dict(color="red", dash="dash", width=2),
        )
    ],
    annotations=[
        dict(
            x=observed_diff,
            y=0.9,
            xref="x",
            yref="paper",
            text="Observed Difference",
            showarrow=True,
            arrowhead=3,
            ax=40,
            ay=-40,
        )
    ],
)
fig.show()
save_plot(fig, 'distr_precision_diff')
Saved to assets/distr_precision_diff.html